BEHAVIOURAL SEGMENTATION ¶

LOAD THE NECESSARY LIBRARIES ¶

import pandas as pd
import numpy as np
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from plotly.offline import iplot
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score
import ipywidgets as widgets
from keras.models import load_model
from ipywidgets import *

print("Libraries loaded!")
Libraries loaded!
# !jupyter nbextension enable --py widgetsnbextension --sys-prefix
# !jupyter serverextension enable voila --sys-prefix

DATA UNDERSTANDING¶

A. Load data¶

print()
print("Loading data.....")
# Load the raw e-commerce transactions. ISO-8859-1 handles the non-UTF8
# characters present in the description column.
# BUG FIX: the dtype key was 'InvoiceID', a column that does not exist in this
# file (the column is 'InvoiceNo' — see the column listing below), so the str
# dtype was silently ignored by read_csv.
customer_data = pd.read_csv('E-Commerce_Data.csv', encoding='ISO-8859-1', dtype={'InvoiceNo': str})
customer_data['InvoiceDate'] = pd.to_datetime(customer_data['InvoiceDate']) #convert to python datetime object
print("Data loaded!")
Loading data.....
Data loaded!

B. Summary of data set¶

print()
def check_data(dataframe):
    """Print a quick structural overview of *dataframe*.

    Shows, in order: shape, column index, dtypes, head, tail and the
    transposed numeric description. Purely a side-effect helper; returns None.
    """
    sections = [
        (" *********************************SHAPE******************************", lambda df: df.shape),
        ("*********************************COLUMNS******************************", lambda df: df.columns),
        ("**********************************TYPES*******************************", lambda df: df.dtypes),
        ("**********************************HEAD*******************************", lambda df: df.head()),
        ("**********************************TAIL*******************************", lambda df: df.tail()),
        ("*******************************DESCRIPTION***************************", lambda df: df.describe().T),
    ]
    last = len(sections) - 1
    for pos, (header, extract) in enumerate(sections):
        print(header)
        print(extract(dataframe))
        print()
        # Every section except the last is followed by a second blank line,
        # matching the original cell's output exactly.
        if pos != last:
            print()
    
    
check_data(customer_data)
 *********************************SHAPE******************************
(541909, 8)


*********************************COLUMNS******************************
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')


**********************************TYPES*******************************
InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object


**********************************HEAD*******************************
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  


**********************************TAIL*******************************
       InvoiceNo StockCode                      Description  Quantity  \
541904    581587     22613      PACK OF 20 SPACEBOY NAPKINS        12   
541905    581587     22899     CHILDREN'S APRON DOLLY GIRL          6   
541906    581587     23254    CHILDRENS CUTLERY DOLLY GIRL          4   
541907    581587     23255  CHILDRENS CUTLERY CIRCUS PARADE         4   
541908    581587     22138    BAKING SET 9 PIECE RETROSPOT          3   

               InvoiceDate  UnitPrice  CustomerID Country  
541904 2011-12-09 12:50:00       0.85     12680.0  France  
541905 2011-12-09 12:50:00       2.10     12680.0  France  
541906 2011-12-09 12:50:00       4.15     12680.0  France  
541907 2011-12-09 12:50:00       4.15     12680.0  France  
541908 2011-12-09 12:50:00       4.95     12680.0  France  


*******************************DESCRIPTION***************************
               count          mean          std       min       25%       50%  \
Quantity    541909.0      9.552250   218.081158 -80995.00      1.00      3.00   
UnitPrice   541909.0      4.611114    96.759853 -11062.06      1.25      2.08   
CustomerID  406829.0  15287.690570  1713.600303  12346.00  13953.00  15152.00   

                 75%      max  
Quantity       10.00  80995.0  
UnitPrice       4.13  38970.0  
CustomerID  16791.00  18287.0  

C. Check for missing values.¶

Get the total number of missing values for each attribute¶
#Get the total number of missing values for each attribute
print()
print(customer_data.isnull().sum())
print()
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

D. Check for duplicated entries¶

#Check for repeated rows
print()
print("Duplicated values:", customer_data.duplicated().sum())
print()
Duplicated values: 5268

E. Countries¶

#Get the unique number of countries within the data set
print()
temp = customer_data.groupby(['CustomerID', 'InvoiceNo', 'Country']).count()
#temp = customer_data.groupby(['Country']).count()
temp = temp.reset_index(drop = False)
countries = temp['Country'].value_counts()
print('Number of countries in the dataframe: {}' .format(len(countries)))
Number of countries in the dataframe: 37
#Visualize the total number of orders for different countries
data = dict(type='choropleth',
locations = countries.index,
locationmode = 'country names', z = countries,
text = countries.index, colorbar = {'title':'Order nb.'},
colorscale=[[0, 'rgb(224,255,255)'],
            [0.01, 'rgb(166,206,227)'], [0.02, 'rgb(31,120,180)'],
            [0.03, 'rgb(178,223,138)'], [0.05, 'rgb(51,160,44)'],
            [0.10, 'rgb(251,154,153)'], [0.20, 'rgb(255,255,0)'],
            [1, 'rgb(227,26,28)']],    
reversescale = False)
#_______________________
layout = dict(title='Number of orders per country',
geo = dict(showframe = True, projection={'type':'mercator'}))
#______________
choromap = go.Figure(data = [data], layout = layout)
iplot(choromap, validate=False)
print()

F. Customers and products¶

Total number of customers, products and transactions¶
#Total number of customers, products and transactions
pd.DataFrame([{'products': len(customer_data['StockCode'].value_counts()),    
               'transactions': len(customer_data['InvoiceNo'].value_counts()),
               'customers': len(customer_data['CustomerID'].value_counts()),  
              }], columns = ['products', 'transactions', 'customers'], index = ['quantity'])
products transactions customers
quantity 4070 25900 4372
Number of products purchased in every transaction¶
#Number of products purchased in every transaction
print()
temp = customer_data.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate'].count()
no_products_per_basket = temp.rename(columns = {'InvoiceDate':'Number of products'})
no_products_per_basket[:10].sort_values('CustomerID')

CustomerID InvoiceNo Number of products
0 12346.0 541431 1
1 12346.0 C541433 1
2 12347.0 537626 31
3 12347.0 542237 29
4 12347.0 549222 24
5 12347.0 556201 18
6 12347.0 562032 22
7 12347.0 573511 47
8 12347.0 581180 11
9 12348.0 539318 17

G. Stock codes¶

List of stock codes in the data set¶
particular_code_list = customer_data[customer_data['StockCode'].str.contains(
    '^[a-zA-Z]+', regex=True)]['StockCode'].unique()
particular_code_list
array(['POST', 'D', 'C2', 'DOT', 'M', 'BANK CHARGES', 'S', 'AMAZONFEE',
       'DCGS0076', 'DCGS0003', 'gift_0001_40', 'DCGS0070', 'm',
       'gift_0001_50', 'gift_0001_30', 'gift_0001_20', 'DCGS0055',
       'DCGS0072', 'DCGS0074', 'DCGS0069', 'DCGS0057', 'DCGSSBOY',
       'DCGSSGIRL', 'gift_0001_10', 'PADS', 'DCGS0004', 'DCGS0073',
       'DCGS0071', 'DCGS0068', 'DCGS0067', 'DCGS0066P', 'B', 'CRUK'],
      dtype=object)
Description of stock code 'B'¶
particular_code_description = customer_data[customer_data['StockCode'] == 'B']
particular_code_description
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
299982 A563185 B Adjust bad debt 1 2011-08-12 14:50:00 11062.06 NaN United Kingdom
299983 A563186 B Adjust bad debt 1 2011-08-12 14:51:00 -11062.06 NaN United Kingdom
299984 A563187 B Adjust bad debt 1 2011-08-12 14:52:00 -11062.06 NaN United Kingdom

DATA PREPARATION¶

A. Dealing with missing values¶

Filling the missing values¶
#Fill the missing values.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that pattern relies on chained assignment, which is deprecated
# and silently stops working in modern pandas (copy-on-write).
# NOTE(review): imputing missing CustomerID with the column mean collapses all
# ~135k anonymous rows onto one artificial "customer" (15287.69...) — confirm
# this is intended before segmenting; dropping those rows is the usual choice.
customer_data['CustomerID'] = customer_data['CustomerID'].fillna(customer_data['CustomerID'].mean())
customer_data['Description'] = customer_data['Description'].fillna('')
print("Done!")
Done!
Check if there are still missing values¶
#Check if there are still missing values
customer_data.isnull().sum()
InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

B. Dealing with repeated entries¶

Remove duplicated entries¶
#remove duplicated rows
customer_data.drop_duplicates(inplace=True)
print("Done!")
Done!
Check if there are still repeated entries¶
#Check if there are still duplicates
print("Repeated entries:",customer_data.duplicated().sum())
Repeated entries: 0
print("Length of data set now", len(customer_data))
Length of data set now 536641

C. Remove bad orders¶

Gather all orders that might indicate a cancelled order¶
#Gather all orders that might indicate a cancelled order
cancelledOrders=customer_data[customer_data["Quantity"]<0]
cancelledOrders.head()
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
141 C536379 D Discount -1 2010-12-01 09:41:00 27.50 14527.0 United Kingdom
154 C536383 35004C SET OF 3 COLOURED FLYING DUCKS -1 2010-12-01 09:49:00 4.65 15311.0 United Kingdom
235 C536391 22556 PLASTERS IN TIN CIRCUS PARADE -12 2010-12-01 10:24:00 1.65 17548.0 United Kingdom
236 C536391 21984 PACK OF 12 PINK PAISLEY TISSUES -24 2010-12-01 10:24:00 0.29 17548.0 United Kingdom
237 C536391 21983 PACK OF 12 BLUE PAISLEY TISSUES -24 2010-12-01 10:24:00 0.29 17548.0 United Kingdom
Check if negative quantity correspond to cancelled transaction¶
#Check if negative quantity correspond to cancelled transaction
display(customer_data.sort_values('CustomerID')[:5])
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
61619 541431 23166 MEDIUM CERAMIC TOP STORAGE JAR 74215 2011-01-18 10:01:00 1.04 12346.0 United Kingdom
61624 C541433 23166 MEDIUM CERAMIC TOP STORAGE JAR -74215 2011-01-18 10:17:00 1.04 12346.0 United Kingdom
428981 573511 22992 REVOLVER WOODEN RULER 12 2011-10-31 12:25:00 1.95 12347.0 Iceland
429001 573511 20719 WOODLAND CHARLOTTE BAG 10 2011-10-31 12:25:00 0.85 12347.0 Iceland
429002 573511 23162 REGENCY TEA STRAINER 8 2011-10-31 12:25:00 3.75 12347.0 Iceland
Check if all negative quantities correspond to cancelled orders¶
#Inspect if all orders with negative quantity indicate cancelled orders,
#i.e. every negative-quantity row has a matching positive-quantity row for
#the same customer and the same description.
data_check = customer_data[customer_data['Quantity']<0][['CustomerID', 'Quantity', 'StockCode', 'Description', 'UnitPrice']]
for index, col in data_check.iterrows():
    # BUG FIX: the original compared customer_data['Description'] against
    # col[2], which positionally is the StockCode column, not the description.
    # Label-based access makes the comparison unambiguous and correct.
    counterpart = customer_data[(customer_data['CustomerID'] == col['CustomerID']) &
                                (customer_data['Quantity'] == -col['Quantity']) &
                                (customer_data['Description'] == col['Description'])]
    if counterpart.shape[0] == 0:
        print(data_check.loc[index])
        print('Hypothesis NOT fulfilled')
        break
CustomerID      14527.0
Quantity             -1
StockCode             D
Description    Discount
UnitPrice          27.5
Name: 141, dtype: object
Hypothesis NOT fulfilled
Check if all non discount negative quantities correspond to cancelled orders¶
#Perform the previous check but ignore discount. 
data_check = customer_data[(customer_data['Quantity']<0) & (customer_data['Description'] != 'Discount')][['CustomerID', 'Quantity', 'StockCode', 'Description', 'UnitPrice']]
for index, col in data_check.iterrows():
    # BUG FIX: as in the previous cell, col[2] is the StockCode column
    # positionally, yet it was compared against the Description column.
    counterpart = customer_data[(customer_data['CustomerID'] == col['CustomerID']) &
                                (customer_data['Quantity'] == -col['Quantity']) &
                                (customer_data['Description'] == col['Description'])]
    if counterpart.shape[0] == 0:
        print(data_check.loc[index])
        print('HYPOTHESIS not fulfilled')
        break
CustomerID                             15311.0
Quantity                                    -1
StockCode                               35004C
Description    SET OF 3 COLOURED  FLYING DUCKS
UnitPrice                                 4.65
Name: 154, dtype: object
HYPOTHESIS not fulfilled
Collect all transactions that relate to cancelled transactions¶
#Gather all entries that relate to cancelled orders and store the quantity cancelled for each cancelled order
# For every cancellation row (negative Quantity that is not a 'Discount'),
# look for an earlier purchase by the same customer of the same stock code
# and record how many units were cancelled against that purchase.
data_cleaned =  customer_data.copy(deep =True)
data_cleaned['QuantityCanceled'] = 0

entry_to_remove = []  # cancellation rows matched to at least one counterpart
doubtful_entry = []   # cancellation rows with no earlier purchase to offset

for index, col in customer_data.iterrows():
    # Skip regular purchases and discount lines — only cancellations matter here.
    if (col['Quantity']> 0) or col['Description'] == 'Discount': 
        continue
    # Candidate counterparts: same customer and stock code, purchased strictly
    # before the cancellation, with a positive quantity.
    data_test = customer_data[(customer_data['CustomerID'] == col['CustomerID']) & (customer_data['StockCode'] == col['StockCode'])
                  & (customer_data['InvoiceDate'] < col['InvoiceDate']) & (customer_data['Quantity'] > 0)].copy()
    #**********************************************
    #Cancellation without counterpart
    if (data_test.shape[0] == 0):
        doubtful_entry.append(index)
        
    #Cancelation with a counterpart
    elif (data_test.shape[0] == 1):
        counterpart_index = data_test.index[0]
        data_cleaned.loc[counterpart_index, 'QuantityCanceled'] = -col['Quantity']
        entry_to_remove.append(index)
        
    #Entries with several counterparts. We delete the last one
    # Walk the candidates from the most recent (highest index) backwards and
    # attach the cancellation to the first purchase large enough to absorb it.
    elif (data_test.shape[0]>1):
        data_test.sort_index(axis=0, ascending=False, inplace=True)
        for ind, val in data_test.iterrows():
            if val['Quantity'] < -col['Quantity']: continue
            data_cleaned.loc[ind, 'QuantityCanceled'] = -col['Quantity']
            entry_to_remove.append(index)
            break
print("Length of cancelled orders without a counterpart and ones with one or more counterpart:", len(doubtful_entry + entry_to_remove))
Length of cancelled orders without a counterpart and ones with one or more counterpart: 9771
Remove all entries that do not have a counterpart and ones that have at least one counterpart¶
#Remove entries that do not have a counterpart and ones that have atleast one counterpart
data_cleaned.drop(entry_to_remove, axis=0, inplace=True)
data_cleaned.drop(doubtful_entry, axis=0, inplace=True)
print("Done!")
Done!
Remove remaining non discount negative entries¶
#Check for entries that have  negative quantity
remaining_entries = data_cleaned[(data_cleaned['Quantity']<0) & (data_cleaned['StockCode']!='D')]
print("Remaining entries to delete: {}".format(remaining_entries.shape[0]))
remaining_entries[:5]
Remaining entries to delete: 739
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country QuantityCanceled
7188 536996 22712 -20 2010-12-03 15:30:00 0.0 15287.69057 United Kingdom 0
7201 537009 84534B -80 2010-12-03 15:38:00 0.0 15287.69057 United Kingdom 0
7202 537010 22162 -40 2010-12-03 15:38:00 0.0 15287.69057 United Kingdom 0
7205 537013 35965 -25 2010-12-03 15:40:00 0.0 15287.69057 United Kingdom 0
7291 537027 18098C -140 2010-12-03 16:36:00 0.0 15287.69057 United Kingdom 0
#Remove remaining supsicious entries
data_cleaned.drop(remaining_entries.index,axis=0,inplace=True)
print("Done!")
Done!
Check if there are still negative non-discount entries to delete¶
# Verify no negative non-discount entries remain after the cleanup.
print("Number of entries to delete: {}".format(data_cleaned[(data_cleaned['Quantity']<0) & (data_cleaned['StockCode']!='D')].shape[0]))
remaining_entries[:5]  # NOTE(review): no-op here — not the cell's last expression, so nothing is displayed
# BUG FIX: the original passed the format string and the length as two
# separate print arguments, producing "... now: {} 526131"; call .format()
# so the placeholder is actually filled.
print("Length of data frame now: {}".format(len(data_cleaned)))
Number of entries to delete: 0
Length of data frame now: {} 526131

D. Basket price¶

Compute the total amount for each entry¶
#Compute total amount for each entry
data_cleaned['TotalPrice'] = data_cleaned['UnitPrice'] * (data_cleaned['Quantity'] - data_cleaned['QuantityCanceled'])
data_cleaned['TotalQuantity'] = data_cleaned['Quantity'] - data_cleaned['QuantityCanceled']
data_cleaned.sort_values('CustomerID')[:5]
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country QuantityCanceled TotalPrice TotalQuantity
61619 541431 23166 MEDIUM CERAMIC TOP STORAGE JAR 74215 2011-01-18 10:01:00 1.04 12346.0 United Kingdom 74215 0.00 0
535014 581180 23508 MINI PLAYING CARDS DOLLY GIRL 20 2011-12-07 15:52:00 0.42 12347.0 Iceland 0 8.40 20
535011 581180 21265 PINK GOOSE FEATHER TREE 60CM 12 2011-12-07 15:52:00 1.95 12347.0 Iceland 0 23.40 12
14968 537626 20782 CAMOUFLAGE EAR MUFF HEADPHONES 6 2010-12-07 14:57:00 5.49 12347.0 Iceland 0 32.94 6
286621 562032 23308 SET OF 60 VINTAGE LEAF CAKE CASES 24 2011-08-02 08:48:00 0.55 12347.0 Iceland 0 13.20 24
Compute the total amount for every transaction¶
#Sum of purchases/ user & order
# Select the aggregated columns with a list of labels: indexing a groupby with
# a tuple of keys raises the FutureWarning shown below and is removed in
# recent pandas versions.
order_total_df = data_cleaned.groupby(by=['CustomerID', 'InvoiceNo', 'InvoiceDate'], as_index=False)[['TotalPrice', 'TotalQuantity']].sum()
basket_price = order_total_df.rename(columns={'TotalPrice': 'Basket Price'})

#Selection of important entries: keep only baskets with a positive total
basket_price = basket_price[basket_price['Basket Price'] > 0]
basket_price.sort_values('CustomerID')[:6]
C:\Users\glori\AppData\Local\Temp\ipykernel_22296\2682052397.py:2: FutureWarning:

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.

CustomerID InvoiceNo InvoiceDate Basket Price TotalQuantity
1 12347.0 537626 2010-12-07 14:57:00 711.79 319
2 12347.0 542237 2011-01-26 14:30:00 475.39 315
3 12347.0 549222 2011-04-07 10:43:00 636.25 483
4 12347.0 556201 2011-06-09 13:01:00 382.52 196
5 12347.0 562032 2011-08-02 08:48:00 584.91 277
6 12347.0 573511 2011-10-31 12:25:00 1294.32 676
Visualize the amount spent within transactions¶
basket_price.loc[basket_price['Basket Price'].idxmax()]
# Count how many baskets fall inside each price band (0-50, 50-100, ...).
price_range = [0, 50, 100, 200, 500, 1000, 5000, 50000]
count_price = []
for i, price in enumerate(price_range):
    if i == 0:
        continue
    items_count = basket_price[(basket_price['Basket Price'] < price) &
                               (basket_price['Basket Price'] > price_range[i-1])]['Basket Price'].count()
    count_price.append(items_count)
#*******************************************************************
#Representation of the purchases amount
plt.rc('font', weight='bold')
f, ax = plt.subplots(figsize=(10, 7))
labels = ['{}<.<{}'.format(price_range[i-1], s) for i, s in enumerate(price_range) if i != 0]
sizes = count_price
# BUG FIX: the original ternary produced 0.0 on both branches, so the
# condition was dead code — every wedge is simply unexploded.
explode = [0.0] * len(sizes)
# Hide percentage labels for slivers below 1% to keep the chart readable.
ax.pie(sizes, explode=explode, labels=labels,
       autopct=lambda x: '{:1.0f}%'.format(x) if x > 1 else '',
       shadow=False, startangle=0)
ax.axis('equal')
f.text(0.5, 1.01, "Representation of the purchases amount",
       ha='center', fontsize=18);

E. RFM analysis¶

Compute how recent a customer performed a transaction¶
#Compute how recent a single customer performed a transaction.
# Recency = whole days between a customer's most recent purchase and the most
# recent purchase anywhere in the data set (0 = bought on the last day seen).
df_recency = (basket_price
              .groupby(by='CustomerID', as_index=False)['InvoiceDate']
              .max())
df_recency.columns = ['CustomerID', 'LastPurchaseDate']
recent_date = df_recency['LastPurchaseDate'].max()
df_recency['Recency'] = df_recency['LastPurchaseDate'].apply(
    lambda last_purchase: (recent_date - last_purchase).days)
df_recency
CustomerID LastPurchaseDate Recency
0 12347.0 2011-12-07 15:52:00 1
1 12348.0 2011-09-25 13:13:00 74
2 12349.0 2011-11-21 09:51:00 18
3 12350.0 2011-02-02 16:01:00 309
4 12352.0 2011-11-03 14:37:00 35
... ... ... ...
4323 18280.0 2011-03-07 09:52:00 277
4324 18281.0 2011-06-12 10:53:00 180
4325 18282.0 2011-12-02 11:43:00 7
4326 18283.0 2011-12-06 12:02:00 3
4327 18287.0 2011-10-28 09:29:00 42

4328 rows × 3 columns

Compute the number of times a customer has made a transaction¶
#Compute the number of times a customer has made a transaction
frequency_df = basket_price.groupby(
    by=['CustomerID'], as_index=False)['InvoiceDate'].count()
frequency_df.columns = ['CustomerID', 'Frequency']
frequency_df.head()
CustomerID Frequency
0 12347.0 7
1 12348.0 4
2 12349.0 1
3 12350.0 1
4 12352.0 7
Compute the total amount spent by a customer ¶
#Compute the total amount spent by a customer within the given time. 
monetary_df = basket_price.groupby(by='CustomerID', as_index=False)['Basket Price'].sum()
monetary_df.columns = ['CustomerID', 'Monetary']
monetary_df.head()
CustomerID Monetary
0 12347.0 4310.00
1 12348.0 1797.24
2 12349.0 1757.55
3 12350.0 334.40
4 12352.0 2385.71
Generate the RFM data set¶
#Generate the RFM data set
rf_df = df_recency.merge(frequency_df, on='CustomerID')
rfm_df = rf_df.merge(monetary_df, on='CustomerID').drop(
    columns='LastPurchaseDate')
rfm_df.head()
CustomerID Recency Frequency Monetary
0 12347.0 1 7 4310.00
1 12348.0 74 4 1797.24
2 12349.0 18 1 1757.55
3 12350.0 309 1 334.40
4 12352.0 35 7 2385.71

F. Outlier detection¶

plt.figure(figsize=(12,12))
plt.title("RFM variables distribution")
rfm_df.boxplot()
<AxesSubplot:title={'center':'RFM variables distribution'}>

G. Data Scaling¶

#Data normalization
print("Scaling data....")
# Standardize the three RFM features to zero mean / unit variance so that
# k-means distances are not dominated by the Monetary scale.
feature_cols = ['Monetary', 'Frequency', 'Recency']
rfm_normalized = rfm_df[feature_cols]

scaler = StandardScaler()
scaled_values = scaler.fit_transform(rfm_normalized)

rfm_normalized = pd.DataFrame(scaled_values, columns=feature_cols)
print("Done!")
Scaling data....
Done!
Scaled data¶
rfm_normalized
Monetary Frequency Recency
0 0.072829 0.107094 -0.905229
1 -0.019727 -0.025766 -0.176468
2 -0.021189 -0.158625 -0.735517
3 -0.073610 -0.158625 2.169545
4 0.001949 0.107094 -0.565806
... ... ... ...
4323 -0.079276 -0.158625 1.850088
4324 -0.082951 -0.158625 0.881734
4325 -0.079423 -0.114339 -0.845331
4326 -0.010582 0.505672 -0.885263
4327 -0.018253 -0.070052 -0.495925

4328 rows × 3 columns

Check for outliers after scaling¶
#Check for outliers after scaling 
plt.figure(figsize=(12,12))
plt.title("Outlier variable distribution")
rfm_normalized.boxplot()
<AxesSubplot:title={'center':'Outlier variable distribution'}>
Outlier Information¶
#Outlier information
rfm_normalized[rfm_normalized['Monetary']>60]
Monetary Frequency Recency
2153 62.558306 61.886723 -0.915212
Remove outlier information¶
#Remove outlier information
print()
indexID = rfm_normalized[rfm_normalized['Monetary'] > 60].index
rfm_normalized.drop(indexID, inplace=True)
print("Done!")
Done!
Recheck the distribution of data¶
# Recheck the distribution of data
plt.figure(figsize=(12,12))
plt.title("RFM variable distribution")
rfm_normalized.boxplot()
<AxesSubplot:title={'center':'RFM variable distribution'}>

MODELLING¶

A. Clustering.¶

Elbow method¶
#Compute the optimal number of clusters with the elbow method: fit k-means
#for k = 2..7 and plot the inertia (within-cluster sum of squares) curve.
inertia = []
num_clusters = range(2, 8)  # reused by the silhouette cells below
for k in num_clusters:
    # random_state pins centroid initialisation so the elbow curve is
    # reproducible across runs; n_init is explicit because its default
    # changed across scikit-learn versions.
    kmeans = KMeans(n_clusters=k, max_iter=50, n_init=10, random_state=42)
    kmeans.fit(rfm_normalized)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(16, 8))
plt.plot(num_clusters, inertia, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('inertia')
plt.title('The Elbow Method showing the optimal k')
plt.show()
print()

Silhouette analysis¶
#Silhouette analysis
print("Calculating silhouette scores for different values of k.....")
for k in num_clusters:
    #initialize kmeans
    kmeans = KMeans(n_clusters=k, max_iter=50)
    kmeans.fit(rfm_normalized)
    cluster_labels = kmeans.labels_
    
    #silhouette score
    silhouette_avg = silhouette_score(rfm_normalized, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(k, silhouette_avg))
Calculating silhouette scores for different values of k.....
For n_clusters=2, the silhouette score is 0.6873406120911523
For n_clusters=3, the silhouette score is 0.5845143260289865
For n_clusters=4, the silhouette score is 0.6030597736942671
For n_clusters=5, the silhouette score is 0.492318872510545
For n_clusters=6, the silhouette score is 0.5180847330599242
For n_clusters=7, the silhouette score is 0.5110443562977901
#Visulaization of silhoutte analysis 
print("Visulization of silhouette scores against clustered data for different values of k")
# BUG FIX: the outer loop bound `i` via enumerate and then shadowed it with the
# inner per-cluster loop variable; the enumerate index was never used, so
# iterate over k directly and give the inner index its own name.
for k in [2, 3, 4, 5, 6]:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Run the Kmeans algorithm
    km = KMeans(n_clusters=k)
    labels = km.fit_predict(rfm_normalized)
    centroids = km.cluster_centers_

    # Get silhouette samples
    silhouette_vals = silhouette_samples(rfm_normalized, labels)

    # Silhouette plot: one horizontal bar band per cluster, stacked vertically.
    y_lower, y_upper = 0, 0
    for cluster_idx, cluster in enumerate(np.unique(labels)):
        cluster_silhouette_vals = silhouette_vals[labels == cluster]
        cluster_silhouette_vals.sort()
        y_upper += len(cluster_silhouette_vals)
        ax1.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
        ax1.text(-0.03, (y_lower + y_upper) / 2, str(cluster_idx + 1))
        y_lower += len(cluster_silhouette_vals)

    # Get the average silhouette score and plot it
    avg_score = np.mean(silhouette_vals)
    ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green')
    ax1.set_yticks([])
    ax1.set_xlim([-0.1, 1])
    ax1.set_xlabel('Silhouette coefficient values')
    ax1.set_ylabel('Cluster labels')
    ax1.set_title('Silhouette plot for the various clusters', y=1.02);

    # Scatter plot of data colored with labels
    ax2.scatter(rfm_normalized['Monetary'], rfm_normalized['Recency'], c=labels)
    # BUG FIX: the y-axis plots Recency, which is column index 2 of
    # rfm_normalized (['Monetary', 'Frequency', 'Recency']); the original drew
    # centroids[:, 1] (Frequency) on the Recency axis.
    ax2.scatter(centroids[:, 0], centroids[:, 2], marker='*', c='r', s=250)
    ax2.set_xlabel('Monetary value')
    ax2.set_ylabel('Recency')
    ax2.set_title('Visualization of clustered data', y=1.02)
    ax2.set_aspect('equal')
    plt.tight_layout()
    plt.suptitle(f'Silhouette analysis using k = {k}',
                 fontsize=16, fontweight='semibold', y=1.05);
Visulization of silhouette scores against clustered data for different values of k
Segmentation¶
#Segmentation of customers
kmeans = KMeans(n_clusters=4, max_iter=50)
kmeans.fit(rfm_normalized)
KMeans(max_iter=50, n_clusters=4)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(max_iter=50, n_clusters=4)
Add CustomerID column¶
rfm_normalized.loc[:, 'CustomerID'] = rfm_df['CustomerID']
rfm_normalized
Monetary Frequency Recency CustomerID
0 0.072829 0.107094 -0.905229 12347.0
1 -0.019727 -0.025766 -0.176468 12348.0
2 -0.021189 -0.158625 -0.735517 12349.0
3 -0.073610 -0.158625 2.169545 12350.0
4 0.001949 0.107094 -0.565806 12352.0
... ... ... ... ...
4323 -0.079276 -0.158625 1.850088 18280.0
4324 -0.082951 -0.158625 0.881734 18281.0
4325 -0.079423 -0.114339 -0.845331 18282.0
4326 -0.010582 0.505672 -0.885263 18283.0
4327 -0.018253 -0.070052 -0.495925 18287.0

4327 rows × 4 columns

Label every customers¶
rfm_normalized['Cluster'] = kmeans.labels_
rfm_normalized
Monetary Frequency Recency CustomerID Cluster
0 0.072829 0.107094 -0.905229 12347.0 1
1 -0.019727 -0.025766 -0.176468 12348.0 1
2 -0.021189 -0.158625 -0.735517 12349.0 1
3 -0.073610 -0.158625 2.169545 12350.0 0
4 0.001949 0.107094 -0.565806 12352.0 1
... ... ... ... ... ...
4323 -0.079276 -0.158625 1.850088 18280.0 0
4324 -0.082951 -0.158625 0.881734 18281.0 3
4325 -0.079423 -0.114339 -0.845331 18282.0 1
4326 -0.010582 0.505672 -0.885263 18283.0 1
4327 -0.018253 -0.070052 -0.495925 18287.0 1

4327 rows × 5 columns

2D Distribution of customers within clusters¶
rfm_normalized['Cluster'].value_counts()
1    2868
3     810
0     626
2      23
Name: Cluster, dtype: int64
Analyze the customer groups¶
plt.figure(figsize=(12,12))
plt.title("Monetary variable distribution within each cluster")
sns.boxplot(x='Cluster', y='Monetary', data=rfm_normalized)
<AxesSubplot:title={'center':'Monetary variable distribution within each cluster'}, xlabel='Cluster', ylabel='Monetary'>
plt.figure(figsize=(12,12))
plt.title("Frequency variable distribution within each cluster")
sns.boxplot(x='Cluster', y='Frequency', data=rfm_normalized)
<AxesSubplot:title={'center':'Frequency variable distribution within each cluster'}, xlabel='Cluster', ylabel='Frequency'>
plt.figure(figsize=(12,12))
plt.title("Recency variable distribution within each cluster")
sns.boxplot(x='Cluster', y='Recency', data=rfm_normalized)
<AxesSubplot:title={'center':'Recency variable distribution within each cluster'}, xlabel='Cluster', ylabel='Recency'>
plt.figure(figsize=(12,12))
plt.title("Clustering: Recency vs Monetary")
RM=sns.scatterplot(x='Recency', y='Monetary', hue='Cluster', palette="Set2", data=rfm_normalized)
plt.figure(figsize=(12,12))
plt.title("Clustering: Frequency vs Monetary")
FM=sns.scatterplot(x='Frequency', y='Monetary', hue='Cluster',palette="Set2", data=rfm_normalized)
plt.figure(figsize=(12,12))
plt.title("Clustering: Recency vs Frequency")
RF=sns.scatterplot(x='Recency', y='Frequency', hue='Cluster', palette="Set2", data=rfm_normalized)
3D representation¶
plt.rcParams["figure.figsize"] = (25,25)
fig = plt.figure(1)
plt.clf()
# Create the 3D axes through the figure API: constructing Axes3D(fig) directly
# and letting it attach itself is deprecated since matplotlib 3.4 (see the
# MatplotlibDeprecationWarning below) and stops working by default in 3.6.
ax = fig.add_axes([0, 0, .95, 1], projection='3d')
ax.view_init(elev=48, azim=134)

plt.cla()
# One point per customer, coloured by its k-means cluster label.
ax.scatter(rfm_normalized['Frequency'], rfm_normalized['Recency'], rfm_normalized['Monetary'],
           c = rfm_normalized['Cluster'],
           s = 200,
           cmap = "spring",
           alpha = 0.5,
           edgecolor = 'darkgrey')

ax.set_xlabel('Frequency',
              fontsize = 16)
ax.set_ylabel('Recency',
              fontsize = 16)
ax.set_zlabel('Monetary',
              fontsize = 16)

plt.show()
# import plotly.express as px

# fig2 = px.scatter_3d(rfm_normalized, x="Frequency", y="Recency", z="Monetary", color="Cluster",)
# fig2.update_layout(title="3 Features Representation")
# fig2.show()
C:\Users\glori\AppData\Local\Temp\ipykernel_22296\408342758.py:4: MatplotlibDeprecationWarning:

Axes3D(fig) adding itself to the figure is deprecated since 3.4. Pass the keyword argument auto_add_to_figure=False and use fig.add_axes(ax) to suppress this warning. The default value of auto_add_to_figure will change to False in mpl3.5 and True values will no longer work in 3.6.  This is consistent with other Axes classes.

D. Artificial Neural Network¶

Sample of independent variables (RFM values)¶
#Selection and distribution of independent and dependent variables
X = rfm_normalized.iloc[:,:3].values
X[:10]
array([[ 7.28288079e-02,  1.07093859e-01, -9.05228987e-01],
       [-1.97274636e-02, -2.57655586e-02, -1.76467627e-01],
       [-2.11894251e-02, -1.58624976e-01, -7.35517437e-01],
       [-7.36104514e-02, -1.58624976e-01,  2.16954497e+00],
       [ 1.94853772e-03,  1.07093859e-01, -5.65805888e-01],
       [-8.26496390e-02, -1.58624976e-01,  1.11134354e+00],
       [-4.61687449e-02, -1.58624976e-01,  1.39086845e+00],
       [-6.90061382e-02, -1.58624976e-01,  1.21117387e+00],
       [ 1.76297233e-02, -7.00520310e-02, -6.95585308e-01],
       [ 1.42728544e-01, -1.58624976e-01, -5.95754985e-01]])
Sample dependent variables (Cluster labels)¶
Y = rfm_normalized.iloc[:,4:5].values
Y[:10]
array([[1],
       [1],
       [1],
       [0],
       [1],
       [3],
       [0],
       [3],
       [1],
       [1]])
One hot encoding on the cluster labels¶
#One hot encode
ohe = OneHotEncoder()
Y = ohe.fit_transform(Y).toarray()
print('One hot encoded array:')
print(Y[0:5])
One hot encoded array:
[[0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]]
Split data set into training and testing data¶
# Hold out 10% of the samples for testing; fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=0
)
print("Done!")
Done!
Define the model¶
# Define a small feed-forward classifier: 3 -> 4 -> 4 units (see model.summary() output)
model = Sequential()
model.add(Dense(X_train.shape[1], input_dim=X_train.shape[1], activation='relu')) # Input layer: one unit per RFM feature (3)
model.add(Dense(4, activation='relu')) # Hidden layer with 4 units
model.add(Dense(4, activation='softmax'))# Output layer: softmax over the 4 cluster labels
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 3)                 12        
                                                                 
 dense_1 (Dense)             (None, 4)                 16        
                                                                 
 dense_2 (Dense)             (None, 4)                 20        
                                                                 
=================================================================
Total params: 48
Trainable params: 48
Non-trainable params: 0
_________________________________________________________________
# Inspect the symbolic output tensor of the first Dense layer — shape (None, 3)
model.layers[0].output
<KerasTensor: shape=(None, 3) dtype=float32 (created by layer 'dense')>
# Multi-class setup: categorical cross-entropy against the one-hot labels, Adam optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
Training model¶
# Train for 100 epochs; a further 10% of the training split is held out for validation
history = model.fit(X_train, Y_train, epochs=100, batch_size=64, validation_split = 0.1)
Epoch 1/100
55/55 [==============================] - 1s 4ms/step - loss: 1.3546 - accuracy: 0.8011 - val_loss: 1.2821 - val_accuracy: 0.7923
Epoch 2/100
55/55 [==============================] - 0s 1ms/step - loss: 1.2192 - accuracy: 0.7891 - val_loss: 1.1436 - val_accuracy: 0.7795
Epoch 3/100
55/55 [==============================] - 0s 1ms/step - loss: 1.0784 - accuracy: 0.8428 - val_loss: 0.9937 - val_accuracy: 0.8590
Epoch 4/100
55/55 [==============================] - 0s 1ms/step - loss: 0.9182 - accuracy: 0.8388 - val_loss: 0.8030 - val_accuracy: 0.8231
Epoch 5/100
55/55 [==============================] - 0s 1ms/step - loss: 0.7191 - accuracy: 0.8111 - val_loss: 0.6066 - val_accuracy: 0.8231
Epoch 6/100
55/55 [==============================] - 0s 1ms/step - loss: 0.5581 - accuracy: 0.8216 - val_loss: 0.4743 - val_accuracy: 0.8333
Epoch 7/100
55/55 [==============================] - 0s 1ms/step - loss: 0.4525 - accuracy: 0.8330 - val_loss: 0.3901 - val_accuracy: 0.8462
Epoch 8/100
55/55 [==============================] - 0s 2ms/step - loss: 0.3850 - accuracy: 0.8539 - val_loss: 0.3365 - val_accuracy: 0.8564
Epoch 9/100
55/55 [==============================] - 0s 1ms/step - loss: 0.3400 - accuracy: 0.8750 - val_loss: 0.2998 - val_accuracy: 0.8795
Epoch 10/100
55/55 [==============================] - 0s 1ms/step - loss: 0.3075 - accuracy: 0.8990 - val_loss: 0.2724 - val_accuracy: 0.8974
Epoch 11/100
55/55 [==============================] - 0s 1ms/step - loss: 0.2826 - accuracy: 0.9072 - val_loss: 0.2519 - val_accuracy: 0.9051
Epoch 12/100
55/55 [==============================] - 0s 1ms/step - loss: 0.2625 - accuracy: 0.9178 - val_loss: 0.2345 - val_accuracy: 0.9179
Epoch 13/100
55/55 [==============================] - 0s 1ms/step - loss: 0.2451 - accuracy: 0.9275 - val_loss: 0.2194 - val_accuracy: 0.9231
Epoch 14/100
55/55 [==============================] - 0s 2ms/step - loss: 0.2298 - accuracy: 0.9418 - val_loss: 0.2060 - val_accuracy: 0.9308
Epoch 15/100
55/55 [==============================] - 0s 2ms/step - loss: 0.2156 - accuracy: 0.9449 - val_loss: 0.1943 - val_accuracy: 0.9385
Epoch 16/100
55/55 [==============================] - 0s 2ms/step - loss: 0.2029 - accuracy: 0.9526 - val_loss: 0.1830 - val_accuracy: 0.9436
Epoch 17/100
55/55 [==============================] - 0s 2ms/step - loss: 0.1911 - accuracy: 0.9592 - val_loss: 0.1730 - val_accuracy: 0.9410
Epoch 18/100
55/55 [==============================] - 0s 2ms/step - loss: 0.1805 - accuracy: 0.9603 - val_loss: 0.1638 - val_accuracy: 0.9436
Epoch 19/100
55/55 [==============================] - 0s 1ms/step - loss: 0.1700 - accuracy: 0.9626 - val_loss: 0.1551 - val_accuracy: 0.9590
Epoch 20/100
55/55 [==============================] - 0s 2ms/step - loss: 0.1608 - accuracy: 0.9663 - val_loss: 0.1473 - val_accuracy: 0.9615
Epoch 21/100
55/55 [==============================] - 0s 1ms/step - loss: 0.1523 - accuracy: 0.9735 - val_loss: 0.1404 - val_accuracy: 0.9564
Epoch 22/100
55/55 [==============================] - 0s 1ms/step - loss: 0.1442 - accuracy: 0.9706 - val_loss: 0.1333 - val_accuracy: 0.9769
Epoch 23/100
55/55 [==============================] - 0s 1ms/step - loss: 0.1369 - accuracy: 0.9792 - val_loss: 0.1273 - val_accuracy: 0.9795
Epoch 24/100
55/55 [==============================] - 0s 1ms/step - loss: 0.1300 - accuracy: 0.9752 - val_loss: 0.1216 - val_accuracy: 0.9795
Epoch 25/100
55/55 [==============================] - 0s 1ms/step - loss: 0.1237 - accuracy: 0.9812 - val_loss: 0.1173 - val_accuracy: 0.9641
Epoch 26/100
55/55 [==============================] - 0s 1ms/step - loss: 0.1184 - accuracy: 0.9792 - val_loss: 0.1117 - val_accuracy: 0.9821
Epoch 27/100
55/55 [==============================] - 0s 1ms/step - loss: 0.1134 - accuracy: 0.9817 - val_loss: 0.1074 - val_accuracy: 0.9923
Epoch 28/100
55/55 [==============================] - 0s 1ms/step - loss: 0.1090 - accuracy: 0.9837 - val_loss: 0.1033 - val_accuracy: 0.9923
Epoch 29/100
55/55 [==============================] - 0s 1ms/step - loss: 0.1048 - accuracy: 0.9846 - val_loss: 0.0997 - val_accuracy: 0.9923
Epoch 30/100
55/55 [==============================] - 0s 1ms/step - loss: 0.1008 - accuracy: 0.9872 - val_loss: 0.0962 - val_accuracy: 0.9923
Epoch 31/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0972 - accuracy: 0.9834 - val_loss: 0.0931 - val_accuracy: 0.9923
Epoch 32/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0944 - accuracy: 0.9883 - val_loss: 0.0901 - val_accuracy: 0.9923
Epoch 33/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0912 - accuracy: 0.9883 - val_loss: 0.0873 - val_accuracy: 0.9949
Epoch 34/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0885 - accuracy: 0.9877 - val_loss: 0.0850 - val_accuracy: 0.9949
Epoch 35/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0860 - accuracy: 0.9886 - val_loss: 0.0823 - val_accuracy: 0.9949
Epoch 36/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0835 - accuracy: 0.9894 - val_loss: 0.0807 - val_accuracy: 0.9923
Epoch 37/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0814 - accuracy: 0.9886 - val_loss: 0.0780 - val_accuracy: 0.9974
Epoch 38/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0792 - accuracy: 0.9894 - val_loss: 0.0762 - val_accuracy: 0.9949
Epoch 39/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0774 - accuracy: 0.9903 - val_loss: 0.0743 - val_accuracy: 0.9949
Epoch 40/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0755 - accuracy: 0.9886 - val_loss: 0.0728 - val_accuracy: 0.9974
Epoch 41/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0737 - accuracy: 0.9892 - val_loss: 0.0710 - val_accuracy: 0.9974
Epoch 42/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0722 - accuracy: 0.9894 - val_loss: 0.0694 - val_accuracy: 0.9974
Epoch 43/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0705 - accuracy: 0.9903 - val_loss: 0.0684 - val_accuracy: 0.9974
Epoch 44/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0688 - accuracy: 0.9886 - val_loss: 0.0664 - val_accuracy: 0.9949
Epoch 45/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0676 - accuracy: 0.9892 - val_loss: 0.0654 - val_accuracy: 0.9974
Epoch 46/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0661 - accuracy: 0.9892 - val_loss: 0.0638 - val_accuracy: 0.9949
Epoch 47/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0651 - accuracy: 0.9900 - val_loss: 0.0629 - val_accuracy: 0.9974
Epoch 48/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0638 - accuracy: 0.9906 - val_loss: 0.0616 - val_accuracy: 0.9949
Epoch 49/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0625 - accuracy: 0.9903 - val_loss: 0.0606 - val_accuracy: 0.9974
Epoch 50/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0612 - accuracy: 0.9917 - val_loss: 0.0600 - val_accuracy: 0.9974
Epoch 51/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0602 - accuracy: 0.9897 - val_loss: 0.0586 - val_accuracy: 0.9974
Epoch 52/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0591 - accuracy: 0.9897 - val_loss: 0.0577 - val_accuracy: 0.9949
Epoch 53/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0579 - accuracy: 0.9906 - val_loss: 0.0570 - val_accuracy: 0.9974
Epoch 54/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0568 - accuracy: 0.9906 - val_loss: 0.0556 - val_accuracy: 0.9974
Epoch 55/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0559 - accuracy: 0.9912 - val_loss: 0.0549 - val_accuracy: 0.9974
Epoch 56/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0548 - accuracy: 0.9906 - val_loss: 0.0537 - val_accuracy: 0.9949
Epoch 57/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0540 - accuracy: 0.9906 - val_loss: 0.0532 - val_accuracy: 0.9974
Epoch 58/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0530 - accuracy: 0.9906 - val_loss: 0.0520 - val_accuracy: 0.9949
Epoch 59/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0523 - accuracy: 0.9906 - val_loss: 0.0516 - val_accuracy: 0.9974
Epoch 60/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0514 - accuracy: 0.9909 - val_loss: 0.0511 - val_accuracy: 0.9974
Epoch 61/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0506 - accuracy: 0.9903 - val_loss: 0.0502 - val_accuracy: 0.9974
Epoch 62/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0499 - accuracy: 0.9906 - val_loss: 0.0502 - val_accuracy: 0.9974
Epoch 63/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0489 - accuracy: 0.9897 - val_loss: 0.0493 - val_accuracy: 0.9974
Epoch 64/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0485 - accuracy: 0.9909 - val_loss: 0.0478 - val_accuracy: 0.9974
Epoch 65/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0474 - accuracy: 0.9917 - val_loss: 0.0474 - val_accuracy: 0.9974
Epoch 66/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0468 - accuracy: 0.9906 - val_loss: 0.0466 - val_accuracy: 0.9974
Epoch 67/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0461 - accuracy: 0.9903 - val_loss: 0.0465 - val_accuracy: 0.9974
Epoch 68/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0452 - accuracy: 0.9903 - val_loss: 0.0451 - val_accuracy: 0.9949
Epoch 69/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0448 - accuracy: 0.9909 - val_loss: 0.0449 - val_accuracy: 0.9974
Epoch 70/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0439 - accuracy: 0.9909 - val_loss: 0.0444 - val_accuracy: 0.9974
Epoch 71/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0434 - accuracy: 0.9912 - val_loss: 0.0434 - val_accuracy: 0.9949
Epoch 72/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0427 - accuracy: 0.9929 - val_loss: 0.0428 - val_accuracy: 0.9949
Epoch 73/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0423 - accuracy: 0.9897 - val_loss: 0.0428 - val_accuracy: 0.9974
Epoch 74/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0416 - accuracy: 0.9903 - val_loss: 0.0420 - val_accuracy: 0.9974
Epoch 75/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0411 - accuracy: 0.9912 - val_loss: 0.0412 - val_accuracy: 0.9949
Epoch 76/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0404 - accuracy: 0.9912 - val_loss: 0.0409 - val_accuracy: 0.9923
Epoch 77/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0400 - accuracy: 0.9912 - val_loss: 0.0403 - val_accuracy: 0.9949
Epoch 78/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0394 - accuracy: 0.9914 - val_loss: 0.0401 - val_accuracy: 0.9923
Epoch 79/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0388 - accuracy: 0.9917 - val_loss: 0.0393 - val_accuracy: 0.9949
Epoch 80/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0382 - accuracy: 0.9906 - val_loss: 0.0391 - val_accuracy: 0.9974
Epoch 81/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0376 - accuracy: 0.9917 - val_loss: 0.0387 - val_accuracy: 0.9974
Epoch 82/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0373 - accuracy: 0.9920 - val_loss: 0.0393 - val_accuracy: 0.9974
Epoch 83/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0367 - accuracy: 0.9914 - val_loss: 0.0381 - val_accuracy: 0.9974
Epoch 84/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0363 - accuracy: 0.9917 - val_loss: 0.0370 - val_accuracy: 0.9949
Epoch 85/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0357 - accuracy: 0.9923 - val_loss: 0.0365 - val_accuracy: 0.9949
Epoch 86/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0353 - accuracy: 0.9906 - val_loss: 0.0362 - val_accuracy: 0.9974
Epoch 87/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0350 - accuracy: 0.9909 - val_loss: 0.0356 - val_accuracy: 0.9949
Epoch 88/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0349 - accuracy: 0.9906 - val_loss: 0.0353 - val_accuracy: 0.9974
Epoch 89/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0340 - accuracy: 0.9912 - val_loss: 0.0359 - val_accuracy: 0.9974
Epoch 90/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0337 - accuracy: 0.9914 - val_loss: 0.0349 - val_accuracy: 0.9974
Epoch 91/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0333 - accuracy: 0.9909 - val_loss: 0.0349 - val_accuracy: 0.9974
Epoch 92/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0328 - accuracy: 0.9917 - val_loss: 0.0336 - val_accuracy: 0.9949
Epoch 93/100
55/55 [==============================] - 0s 2ms/step - loss: 0.0324 - accuracy: 0.9912 - val_loss: 0.0337 - val_accuracy: 0.9974
Epoch 94/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0319 - accuracy: 0.9923 - val_loss: 0.0330 - val_accuracy: 0.9949
Epoch 95/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0318 - accuracy: 0.9909 - val_loss: 0.0327 - val_accuracy: 0.9949
Epoch 96/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0313 - accuracy: 0.9897 - val_loss: 0.0333 - val_accuracy: 0.9974
Epoch 97/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0313 - accuracy: 0.9903 - val_loss: 0.0336 - val_accuracy: 0.9974
Epoch 98/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0307 - accuracy: 0.9906 - val_loss: 0.0322 - val_accuracy: 0.9974
Epoch 99/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0303 - accuracy: 0.9906 - val_loss: 0.0324 - val_accuracy: 0.9974
Epoch 100/100
55/55 [==============================] - 0s 1ms/step - loss: 0.0300 - accuracy: 0.9917 - val_loss: 0.0318 - val_accuracy: 0.9974

EVALUATION¶

Evaluate model on test data¶
print("Predicting.....")
y_pred = model.predict(X_test)
# Convert each row of class probabilities to its predicted cluster label.
# np.argmax over axis=1 replaces the per-row Python loop (vectorized, same result).
pred = np.argmax(y_pred, axis=1).tolist()
print("Done!")
Predicting.....
14/14 [==============================] - 0s 850us/step
Done!
Reverse one hot encoding process¶
# Reverse the one-hot encoding: the index of the 1 in each row is the original label.
# Vectorized np.argmax over axis=1 replaces the per-row Python loop.
test = np.argmax(Y_test, axis=1).tolist()
print("Done!")
Done!
Accuracy of model on test data (%)¶
# Show the first ten predicted labels for side-by-side comparison with the truth
print()
print("Predicted values")  # fixed typo: was "Pridicted values"
pred[:10]
Pridicted values
[3, 1, 0, 3, 1, 1, 3, 1, 1, 1]
# Show the first ten ground-truth labels for comparison with the predictions above
print()
print("Actual values")
test[:10]
Actual values
[3, 1, 0, 3, 1, 1, 3, 1, 1, 1]
# accuracy_score expects (y_true, y_pred); accuracy is symmetric so the value is
# unchanged, but the conventional argument order avoids confusion.
a = accuracy_score(test, pred)
print('Accuracy of model on test data is:', a * 100)
Accuracy of model on test data is: 99.76905311778292
Visualize the accuracy and loss of training and validation data¶
# Training vs. validation accuracy per epoch
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# Second curve comes from validation_split, not the test set — label it accordingly
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()
# Training vs. validation loss per epoch
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
# Second curve comes from validation_split, not the test set — label it accordingly
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()
Save model¶
import pickle

print("Saving model....")
# NOTE(review): pickling a Keras model relies on Keras' serialization hooks and is
# fragile across versions — model.save(...) is the recommended path. Pickle is kept
# here so the existing .pkl artifact and any loading code keep working.
# Use a context manager so the file handle is always closed (the original leaked it).
with open('Behavioural_Segmentation_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
print("Model saved")
Saving model....
INFO:tensorflow:Assets written to: ram://cb2ad320-36ba-4252-a332-5f6337684eb0/assets
Model saved
# Function to predict a customer's cluster from user-supplied RFM values.
def run_model(Recency_pred, Frequency_pred, Monetary_pred):
    """Scale one (Recency, Frequency, Monetary) sample, run the trained ANN,
    and return an ipywidgets Output widget showing the predicted cluster label.

    Relies on module-level globals: `scaler` (presumably a StandardScaler fitted
    earlier in the notebook — confirm), `model` (the trained Keras classifier),
    and `Output` from ipywidgets.
    """
    # Build a one-row frame. NOTE(review): column order here is
    # Monetary, Frequency, Recency — this must match the column order the scaler
    # and the model were fitted on (rfm_normalized.iloc[:, :3]); TODO confirm.
    pred_data = {'Monetary':[Monetary_pred],'Frequency':[Frequency_pred], 'Recency':[Recency_pred]}
    pred_data=pd.DataFrame(pred_data)
    print()
    print("UNSCALED DATA SET")
    print(pred_data)
    # Apply the same standardization used at training time.
    pred_data = pd.DataFrame(scaler.transform(pred_data), columns=pred_data.columns)
    print()
    print("SCALED DATA SET")
    print(pred_data)
    print()
    print("PREDICTING......")
    X1=np.array(pred_data)
    print(X1)
    result = model.predict(X1)
    # argmax over the softmax probabilities gives the predicted cluster index.
    k = list()
    for i in range(len(result)):
        k.append(np.argmax(result[i]))
    # Render the prediction inside an Output widget so interact() can display it.
    out = Output(layout={'border': '1px solid white'})

    with out:
        print(k)

    return out

PREDICTING NEW VALUES¶

Please input a user's recency, frequency and monetary values¶
# Interactive widget: bounded numeric inputs for Recency/Frequency/Monetary feed
# run_model. Bounds presumably mirror the observed min/max in the data — confirm.
interact(run_model,
         Recency_pred = BoundedFloatText(value = 0, min = 0, max = 373, step = 1, description = 'Recency'),
         # initial value was 0 with min=1 (out of bounds — the widget clamps it);
         # start at the declared minimum instead
         Frequency_pred = BoundedFloatText(value = 1, min = 1, max = 1402, step = 1, description = 'Frequency'),
         Monetary_pred = BoundedFloatText(value = 2.9, min = 2.9, max = 279765.02, step = 1, description = 'Monetary'),)
interactive(children=(BoundedFloatText(value=0.0, description='Recency', max=373.0, step=1.0), BoundedFloatTex…
<function __main__.run_model(Recency_pred, Frequency_pred, Monetary_pred)>